Chapter 03 - visualizations

matplotlib, pg 73



In [5]:

    
from matplotlib import pyplot as plt



In [9]:

    
# one GDP reading per decade, 1950 through 2010 inclusive
years = list(range(1950, 2011, 10))
gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]

# create a line chart for years on x-axis and gdp on y-axis
plt.plot(years, gdp, color='green', marker='o', linestyle='solid')
plt.title("Nominal GDP")  # add title
plt.ylabel("Billions of $")  # add y-axis label
plt.show()

Bar Charts, pg 75



In [19]:

    
movies = ['Annie Hall', 'Ben-Hur', 'Casablanca', 'Gandhi', 'West Side Story']
num_oscars = [5, 11, 3, 8, 10]

# x-axis: one integer slot per movie
# align='center' is passed explicitly so each bar is centered on its slot
# no matter which alignment the installed matplotlib uses by default
xs = list(range(len(movies)))

# plot bars
plt.bar(xs, num_oscars, align='center')
plt.ylabel('Number of Oscars')
plt.title('My Favorite Movies')
# label x-axis with movie names, placed at the bar centers
plt.xticks(xs, movies)

plt.show()

Bar charts are also good for showing distributions (histograms of bucketed values), pg 76



In [25]:

    
from collections import Counter
grades = [83, 95, 91, 87, 70, 0, 85, 82, 100, 67, 73, 77, 0]


def decile(grade):
    """Return the lower bound of the 10-point bucket that holds `grade`."""
    return grade // 10 * 10


# map each decile to the number of grades that fall in it
histogram = Counter(decile(grade) for grade in grades)

# width-8 bars centered on their decile value, so every bar sits on its
# x-axis tick; align is explicit because the default alignment differs
# between matplotlib versions
plt.bar(list(histogram.keys()),    # bar centers: the decile values
        list(histogram.values()),  # give each bar correct height
        8,                         # set bar width to 8
        align='center')

plt.axis([-5, 105, 0, 5])  # x-axis from -5..105, and y from 0..5

plt.xticks([10 * i for i in range(11)])  # x-axis labels at 0, 10, ..., 100
plt.xlabel("Decile")
plt.ylabel("# of Students")
plt.title("Distribution of Exam 1 Grades")

plt.show()

Line Charts, pg 79



In [26]:

    
variance = [1, 2, 4, 8, 16, 32, 64, 128, 256]
bias_squared = [256, 128, 64, 32, 16, 8, 4, 2, 1]
# total error at each position is the sum of the two components
total_error = [sum(pair) for pair in zip(variance, bias_squared)]
# x positions are simply 0..len(variance)-1
xs = list(range(len(variance)))

# several series can share one figure: call plt.plot once per series
series = (
    (variance, 'g-', 'variance'),         # green solid line
    (bias_squared, 'r-.', 'bias^2'),      # red dot-dash line
    (total_error, 'b:', 'total error'),   # blue dotted line
)
for values, line_format, series_label in series:
    plt.plot(xs, values, line_format, label=series_label)

# every series carries a label, so a legend comes for free
# location code 9 puts it at the top center
plt.legend(loc=9)
plt.xlabel("model complexity")
plt.title('The Bias-Variance Tradeoff')

plt.show()

Scatterplots, pg 80



In [27]:

    
friends = [70, 65, 72, 63, 71, 64, 60, 64, 67]
minutes = [175, 170, 205, 120, 220, 130, 105, 145, 100]
labels = list('abcdefghi')  # one single-letter label per point

plt.scatter(friends, minutes)

# annotate every point with its letter, nudged a few points away
# from the marker so the text does not sit on top of it
label_offset = (5, -5)
for point_label, point in zip(labels, zip(friends, minutes)):
    plt.annotate(point_label,
                 xy=point,
                 xytext=label_offset,
                 textcoords='offset points')

plt.title('Daily Minutes vs. Number of Friends')
plt.xlabel('# of Friends')
plt.ylabel('Daily minutes spent on site')

plt.show()



In [28]:

    
# more scatter plots, pg 81
# when the two variables are comparable, letting matplotlib pick the
# axis scales on its own can give a skewed, misleading picture
test_1_grades = [99, 90, 85, 97, 80]
test_2_grades = [100, 85, 60, 90, 70]

# x = test 1, y = test 2, matching the axis labels below
plt.scatter(test_1_grades, test_2_grades)
plt.title('Axes Are not comparable')
plt.xlabel('test 1 grades')
plt.ylabel('test 2 grades')

plt.show()

For more visualization tools, check out seaborn, D3.js, Bokeh, and ggplot (a Python port of R's ggplot2).